import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Cookie Cats A/B-test dataset and display it
df = pd.read_csv("cookie_cats.csv")
df
| userid | version | sum_gamerounds | retention_1 | retention_7 | |
|---|---|---|---|---|---|
| 0 | 116 | gate_30 | 3 | False | False |
| 1 | 337 | gate_30 | 38 | True | False |
| 2 | 377 | gate_40 | 165 | True | False |
| 3 | 483 | gate_40 | 1 | False | False |
| 4 | 488 | gate_40 | 179 | True | True |
| ... | ... | ... | ... | ... | ... |
| 90184 | 9999441 | gate_40 | 97 | True | False |
| 90185 | 9999479 | gate_40 | 30 | False | False |
| 90186 | 9999710 | gate_30 | 28 | True | False |
| 90187 | 9999768 | gate_40 | 51 | True | False |
| 90188 | 9999861 | gate_40 | 16 | False | False |
90189 rows × 5 columns
# Column dtypes, non-null counts and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 90189 entries, 0 to 90188 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 userid 90189 non-null int64 1 version 90189 non-null object 2 sum_gamerounds 90189 non-null int64 3 retention_1 90189 non-null bool 4 retention_7 90189 non-null bool dtypes: bool(2), int64(2), object(1) memory usage: 2.2+ MB
# Summary statistics for the numeric columns (userid, sum_gamerounds)
df.describe()
| userid | sum_gamerounds | |
|---|---|---|
| count | 9.018900e+04 | 90189.000000 |
| mean | 4.998412e+06 | 51.872457 |
| std | 2.883286e+06 | 195.050858 |
| min | 1.160000e+02 | 0.000000 |
| 25% | 2.512230e+06 | 5.000000 |
| 50% | 4.995815e+06 | 16.000000 |
| 75% | 7.496452e+06 | 51.000000 |
| max | 9.999861e+06 | 49854.000000 |
# Count how many users fall into each A/B version and show the split.
pie_data = df.groupby('version', as_index=False)['userid'].count()
print(pie_data)

# Pie chart of the user distribution across the two gates
fig = px.pie(
    pie_data,
    values='userid',
    names='version',
    title='Distribution of Versions',
    labels={'userid': 'Number of Users', 'version': 'Version'},
)
fig.show()
version userid 0 gate_30 44700 1 gate_40 45489
# Sensitivity check: how many rows would be flagged as outliers in
# 'sum_gamerounds' for a range of Z-score thresholds.
z_scores = stats.zscore(df['sum_gamerounds'])

thresholds_to_try = [2, 2.5, 3, 3.5, 4]
# Count flagged rows straight off the boolean mask — no need to
# materialize a full outlier DataFrame per threshold just to len() it.
outliers_count = [int((abs(z_scores) > threshold).sum())
                  for threshold in thresholds_to_try]

# Bar chart: threshold value vs. number of flagged outliers
fig = go.Figure(data=[go.Bar(x=thresholds_to_try, y=outliers_count)])
fig.update_layout(
    title='Effect of Different Thresholds on Outliers Count',
    xaxis_title='Threshold',
    yaxis_title='Number of Outliers'
)
fig.show()
# Build a cleaned copy of the data for each candidate Z-score threshold
# and inspect the remaining distribution with a box plot per threshold.
threshold_values = [2, 2.5, 3, 3.5, 4]

# Z-scores depend only on the data, so compute them once outside the loop
# (the original recomputed them on every iteration).
z_scores = stats.zscore(df['sum_gamerounds'])

dfs_no_outliers = {}  # Dictionary to store dataframes without outliers for different thresholds
for threshold in threshold_values:
    df_no_outlier = df[abs(z_scores) <= threshold]
    dfs_no_outliers[f"df_no_outlier_threshold_{threshold}"] = df_no_outlier

# One box plot per threshold. Use the numeric threshold in the title:
# the original sliced the dict key (key[-3:]), which produced mangled
# labels like "d_2" for integer thresholds.
for threshold in threshold_values:
    df_no_outlier = dfs_no_outliers[f"df_no_outlier_threshold_{threshold}"]
    fig = px.box(df_no_outlier, y='sum_gamerounds',
                 title=f'Box Plot without Outliers (Threshold: {threshold})')
    fig.show()
# Flag 'sum_gamerounds' values whose Z-score magnitude exceeds 3 and
# highlight them in red on top of a box plot of the full distribution.
z_scores = stats.zscore(df['sum_gamerounds'])
threshold = 3
is_outlier = abs(z_scores) > threshold
outliers = df[is_outlier]

# Box plot of the raw data with the flagged points overlaid
fig = px.box(df, y='sum_gamerounds', title='Identifying Outliers with Z-score')
fig.add_scatter(
    x=outliers.index,
    y=outliers['sum_gamerounds'],
    mode='markers',
    marker=dict(color='red', size=10),
    name='Outliers',
)
fig.show()

# Show the flagged rows
print("Outliers:")
print(outliers)
Outliers:
userid version sum_gamerounds retention_1 retention_7
601 63617 gate_30 902 True True
655 69927 gate_30 1906 True True
865 97308 gate_30 798 True True
1097 121303 gate_30 1374 True True
1264 139072 gate_40 681 False True
... ... ... ... ... ...
88328 9791599 gate_40 2063 True True
88354 9794383 gate_40 846 True True
88590 9822327 gate_40 768 True True
89719 9949589 gate_40 708 True True
89921 9971042 gate_30 892 True True
[425 rows x 5 columns]
# Build cleaned_df: keep only rows whose 'sum_gamerounds' Z-score is
# within the threshold of 3 (the outlier subset itself is not needed
# here — it was already printed in the previous step).
z_scores = stats.zscore(df['sum_gamerounds'])
threshold = 3
cleaned_df = df[abs(z_scores) <= threshold]
print("Cleaned DataFrame without outliers (Threshold=3):")
print(cleaned_df)
Cleaned DataFrame without outliers (Threshold=3):
userid version sum_gamerounds retention_1 retention_7
0 116 gate_30 3 False False
1 337 gate_30 38 True False
2 377 gate_40 165 True False
3 483 gate_40 1 False False
4 488 gate_40 179 True True
... ... ... ... ... ...
90184 9999441 gate_40 97 True False
90185 9999479 gate_40 30 False False
90186 9999710 gate_30 28 True False
90187 9999768 gate_40 51 True False
90188 9999861 gate_40 16 False False
[89764 rows x 5 columns]
# Re-check the A/B split after outlier removal — it should stay balanced.
pie_data = cleaned_df.groupby('version', as_index=False)['userid'].count()
print(pie_data)

# Pie chart of the cleaned user distribution
fig = px.pie(
    pie_data,
    values='userid',
    names='version',
    title='Distribution of Versions WITH CLEANED DATA',
    labels={'userid': 'Number of Users', 'version': 'Version'},
)
fig.show()
version userid 0 gate_30 44500 1 gate_40 45264
# Number of players at each total-gamerounds value (cleaned data)
plot_df = cleaned_df.groupby("sum_gamerounds")["userid"].count().reset_index(name='count')
print(plot_df)

# Distribution of players that played 0 to 100 game rounds.
# Filter on the actual gameround value rather than head(100): the first
# 100 rows only correspond to rounds 0-99 if every count is present.
low_rounds = plot_df[plot_df['sum_gamerounds'] <= 100]
fig = px.bar(low_rounds, x='sum_gamerounds', y='count',
             labels={'sum_gamerounds': 'total gamerounds', 'count': 'number of players'},
             title="<b>DISTRIBUTION OF PLAYERS</b>",  # typo fixed (was DISTRUBUTION)
             )
fig.update_layout(
    xaxis=dict(title=dict(text="Total Gamerounds", font=dict(size=12))),
    yaxis=dict(title=dict(text="#Number of Players", font=dict(size=12)))
)
fig.show()
sum_gamerounds count 0 0 3994 1 1 5538 2 2 4606 3 3 3958 4 4 3629 .. ... ... 629 633 3 630 634 1 631 635 1 632 636 2 633 637 3 [634 rows x 2 columns]
# Players who installed the game but never played a single round
print("Players installed the game but then never played it:")
never_played = cleaned_df["sum_gamerounds"] == 0
cleaned_df.loc[never_played, "userid"].count()
Players installed the game but then never played it:
3994
# Overall retention rates; the mean of a boolean column is the share of
# True values, so mean() * 100 gives the retention percentage directly.
print("Day-1 Retention %")
retention_percentage = cleaned_df['retention_1'].mean() * 100
print(retention_percentage)

print("\nDay-7 Retention %")
retention_percentage = cleaned_df['retention_7'].mean() * 100
print(retention_percentage)
Day-1 Retention % 44.276101777995635 Day-7 Retention % 18.23670959404661
# Retention rates split by A/B version (gate_30 vs gate_40)
def _retention_pct(frame, col):
    """Share of retained users per version, in percent."""
    return frame.groupby('version')[col].mean() * 100

day_1_retention = _retention_pct(cleaned_df, 'retention_1')
print("Day-1 Retention %")
print(day_1_retention)

day_7_retention = _retention_pct(cleaned_df, 'retention_7')
print("\nDay-7 Retention %")
print(day_7_retention)
Day-1 Retention % version gate_30 44.593258 gate_40 43.964298 Name: retention_1, dtype: float64 Day-7 Retention % version gate_30 18.676404 gate_40 17.804436 Name: retention_7, dtype: float64
# Bootstrap the per-version retention means to gauge sampling variability.
bootstrap_1d = []
bootstrap_7d = []
for _ in range(10000):
    # Draw one resample per iteration and reuse it for both metrics —
    # the original drew two independent resamples, doubling the work
    # without changing either marginal distribution.
    resample = cleaned_df.sample(frac=1, replace=True)
    grouped = resample.groupby('version')
    bootstrap_1d.append(grouped['retention_1'].mean())
    bootstrap_7d.append(grouped['retention_7'].mean())

# One row per bootstrap replicate, one column per version
bootstrap_1d = pd.DataFrame(bootstrap_1d)
bootstrap_7d = pd.DataFrame(bootstrap_7d)

# Kernel Density Estimate plot of the bootstrap distributions
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(12, 6))
bootstrap_1d.plot.kde(ax=ax1)
ax1.set_xlabel("retention rate", size=12)
ax1.set_ylabel("number of sample", size=12)
ax1.set_title("1 day retention rate distribution", fontweight="bold", size=14)
bootstrap_7d.plot.kde(ax=ax2)
ax2.set_xlabel("retention rate", size=12)
ax2.set_title("7 days retention rate distribution", fontweight="bold", size=14)
plt.show()
# Percent difference between the two A/B groups for each replicate
for boot in (bootstrap_1d, bootstrap_7d):
    boot['diff'] = (boot['gate_30'] - boot['gate_40']) / boot['gate_40'] * 100

# KDE of the bootstrapped % differences for both retention horizons
fig, ax1 = plt.subplots(1, 1, figsize=(12, 6))
bootstrap_1d['diff'].plot.kde(ax=ax1, c="#ff0000", label="1 day retention")
bootstrap_7d['diff'].plot.kde(ax=ax1, c="#0000ff", label="7 days retention")
ax1.set_xlabel("% difference", size=12)
ax1.set_ylabel("% density", size=12)
ax1.set_title("Difference in retention \n between the two A/B groups", fontweight="bold", size=14)
plt.legend()
plt.show()
# Display the bootstrapped day-1 retention means (one row per replicate)
bootstrap_1d
| version | gate_30 | gate_40 | diff |
|---|---|---|---|
| retention_1 | 0.446200 | 0.440091 | 1.388130 |
| retention_1 | 0.441099 | 0.439577 | 0.346243 |
| retention_1 | 0.447878 | 0.437474 | 2.378187 |
| retention_1 | 0.447442 | 0.434079 | 3.078604 |
| retention_1 | 0.443168 | 0.441401 | 0.400290 |
| ... | ... | ... | ... |
| retention_1 | 0.445042 | 0.438820 | 1.417828 |
| retention_1 | 0.447340 | 0.436262 | 2.539445 |
| retention_1 | 0.446511 | 0.442517 | 0.902442 |
| retention_1 | 0.445566 | 0.439694 | 1.335542 |
| retention_1 | 0.448768 | 0.440738 | 1.822123 |
10000 rows × 3 columns
# Summary statistics of the day-1 bootstrap distribution
bootstrap_1d.describe()
| version | gate_30 | gate_40 | diff |
|---|---|---|---|
| count | 10000.000000 | 10000.000000 | 10000.000000 |
| mean | 0.445912 | 0.439648 | 1.427712 |
| std | 0.002340 | 0.002331 | 0.759106 |
| min | 0.436764 | 0.431514 | -1.484402 |
| 25% | 0.444361 | 0.438074 | 0.912329 |
| 50% | 0.445924 | 0.439651 | 1.430568 |
| 75% | 0.447466 | 0.441192 | 1.939760 |
| max | 0.454770 | 0.448981 | 4.045004 |
# Display the bootstrapped day-7 retention means (one row per replicate)
bootstrap_7d
| version | gate_30 | gate_40 | diff |
|---|---|---|---|
| retention_7 | 0.183915 | 0.178884 | 2.812501 |
| retention_7 | 0.182837 | 0.178442 | 2.462781 |
| retention_7 | 0.185701 | 0.177777 | 4.457648 |
| retention_7 | 0.187162 | 0.178738 | 4.712872 |
| retention_7 | 0.189298 | 0.177524 | 6.632363 |
| ... | ... | ... | ... |
| retention_7 | 0.189017 | 0.176595 | 7.034371 |
| retention_7 | 0.186712 | 0.178908 | 4.362087 |
| retention_7 | 0.186470 | 0.176773 | 5.485284 |
| retention_7 | 0.187181 | 0.178749 | 4.717266 |
| retention_7 | 0.185063 | 0.176845 | 4.647120 |
10000 rows × 3 columns
# Summary statistics of the day-7 bootstrap distribution
bootstrap_7d.describe()
| version | gate_30 | gate_40 | diff |
|---|---|---|---|
| count | 10000.000000 | 10000.000000 | 10000.000000 |
| mean | 0.186756 | 0.178055 | 4.897479 |
| std | 0.001828 | 0.001776 | 1.466559 |
| min | 0.180551 | 0.171000 | -0.147170 |
| 25% | 0.185505 | 0.176861 | 3.892644 |
| 50% | 0.186765 | 0.178057 | 4.887303 |
| 75% | 0.187984 | 0.179249 | 5.876139 |
| max | 0.193528 | 0.184260 | 11.262659 |
# Probability that gate_30 beats gate_40, estimated as the share of
# bootstrap replicates where the % difference is positive (bool mean).
probability_1 = (bootstrap_1d['diff'] > 0).mean()
# Same probability for 7-day retention
probability_7 = (bootstrap_7d['diff'] > 0).mean()
print(f"The probability of 1-day retention is greater when the gate is at level 30: {(probability_1)*100}% \
\nThe probability of 7-days retention is greater when the gate is at level 30: {(probability_7)*100}% ")
The probability of 1-day retention is greater when the gate is at level 30: 97.0% The probability of 7-days retention is greater when the gate is at level 30: 99.98%
The likelihood of observing a higher 1-day retention when the gate is at level 30 stands at 97.0%. Correspondingly, the probability of a superior 7-day retention with the gate at level 30 is estimated at 99.98%.
As per the bootstrap analysis, there exists compelling evidence, accounting for 99.98% certainty, that 7-day retention tends to be higher when the gate is positioned at level 30 compared to level 40.
In summary, the inference drawn from these findings suggests that for maintaining high retention rates, both for 1-day and 7-day spans, it would be advisable to retain the gate at level 30 rather than shifting it to level 40. It's worth noting that while other metrics such as the quantity of game rounds played or the in-game purchase amounts of the two AB-groups may be considered, retention remains a pivotal metric.